In [6]:
# Standard imports. Note: You must pip install nasdaqdatalink 1st
import os
import pandas as pd
import hvplot.pandas
from pathlib import Path

# For API Calls
import nasdaqdatalink
# Do we need requests?
import requests
# For opening zip folder
import shutil 
# For technical analysis
import pandas_ta as ta
In [7]:
# Linking my API key to .env in the same folder. The key is stored in the folder without any quotations around it 

nasdaqdatalink.read_key(filename=".env")
In [8]:
# A function to retrieve a dataframe of counties, zips, etc
def get_regions(regions):
    region_df=nasdaqdatalink.get_table('ZILLOW/REGIONS', region_type=regions)  
    return region_df

1. Get the regions data from Zillow REST APIs.¶

This contains a list of all counties in the US.

In [9]:
# Using get_regions to retrieve a list of counties
region_df = get_regions('county')
region_df[["county", "state"]] = region_df["region"].str.split(';', 1, expand=True)
region_df["state"] = region_df["state"].str.split(';', 1, expand=True)[0]

#
# Clean up regions data
# Remove ' County' so that we can match the Zillow data with Wikipedia data.
region_df["county"] = region_df["county"].str.replace(" County", "")

# Remove the leading blank space from the 'state' column.
region_df["state"] = region_df['state'].str[1:]

# Clean up region_id datatype.
region_df['region_id']=region_df['region_id'].astype(int)

# Check data for region_df
print(region_df.head())
print(region_df.tail())
region_id region_type region county state
None
0 999 county Durham County; NC; Durham-Chapel Hill Durham NC
1 998 county Duplin County; NC Duplin NC
2 997 county Dubois County; IN; Jasper Dubois IN
3 995 county Donley County; TX Donley TX
4 993 county Dimmit County; TX Dimmit TX
region_id region_type region county state
None
2886 1003 county Elmore County; AL; Montgomery Elmore AL
2887 1002 county Elbert County; GA Elbert GA
2888 1001 county Elbert County; CO; Denver-Aurora-Lakewood Elbert CO
2889 1000 county Echols County; GA; Valdosta Echols GA
2890 100 county Bibb County; AL; Birmingham-Hoover Bibb AL

2. Get the Zillow sales data.¶

In this example, we read in Zillow sales data in the form of a CSV file.

In [10]:
# Get the Zillow sales data. 
# The actual API call using the SDK.
# Instructions can be found here https://data.nasdaq.com/databases/ZILLOW/usage/quickstart/python
# Replace 'quandl' w/ 'nasdaqdatalink
# Turned into a function to prevent constant re-downloading massive csv

def get_zillow_data():
    data = nasdaqdatalink.export_table('ZILLOW/DATA', indicator_id='ZSFH', region_id=list(region_df['region_id']),filename='db.zip')
    
    # Unzipping database from API call
    shutil.unpack_archive('db.zip')
    return data        
In [11]:
# Reading in Database
zillow_data=pd.read_csv(
    Path('ZILLOW_DATA_d5d2ff90eb7172dbde848ea36de12dfe.csv')
)

# Check the Zillow sales data
print(zillow_data.head())
print(zillow_data.tail())
indicator_id region_id date value
0 ZSFH 100 2007-11-30 123760.0
1 ZSFH 100 2007-12-31 123754.0
2 ZSFH 100 2008-01-31 123605.0
3 ZSFH 100 2008-02-29 123393.0
4 ZSFH 100 2008-03-31 123095.0
indicator_id region_id date value
669311 ZSFH 999 2022-02-28 390111.0
669312 ZSFH 999 2022-03-31 401621.0
669313 ZSFH 999 2022-04-30 411421.0
669314 ZSFH 999 2022-05-31 422028.0
669315 ZSFH 999 2022-06-30 430509.0
In [12]:
## Merge the Region dataframe with the Zillow sales data
zillow_merge_df = pd.merge(region_df, zillow_data, on=['region_id'])

# Check the merged Zillow data
zillow_merge_df.head()
Out[12]:
region_id region_type region county state indicator_id date value
0 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-02-28 139430.0
1 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-03-31 139459.0
2 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-04-30 139659.0
3 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-05-31 139887.0
4 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-06-30 140303.0

3. Get the county coordinates data.¶

We couldn't find the county coordinates from Zillow, so we sourced the data from Wikipedia. We are going to have to merge the data with Zillow based on county and state.

In [13]:
# Read in county data with coordinates
county_coordinates_df=pd.read_csv(
    Path('counties_w_coordinates.csv')
)

# Clean up data.
# We need to rename the columns so that we can merge our Zillow data set 
# with the county coordinates data.   The dataframes will be merged against 'county' and 'state'. 
county_coordinates_df = county_coordinates_df.rename(columns={"County\xa0[2]" : "county"})
# county_coordinates_df = county_coordinates_df.rename(columns={"region" : "region"})
county_coordinates_df = county_coordinates_df.rename(columns={"State" : "state"})

# Remove degrees 
county_coordinates_df["Latitude"] = county_coordinates_df["Latitude"].str.replace("°", "")
county_coordinates_df["Longitude"] = county_coordinates_df["Longitude"].str.replace("°", "")

# Remove + sign for Latitude and Longitude
county_coordinates_df["Latitude"] = county_coordinates_df["Latitude"].str.replace("+", "")
county_coordinates_df["Longitude"] = county_coordinates_df["Longitude"].str.replace("+", "")

# Some of the data uses unicode hyphens which causes problems when trying to convert the Longitude and Latitude to float.
county_coordinates_df["Latitude"] = county_coordinates_df["Latitude"].str.replace('\U00002013', '-')
county_coordinates_df["Longitude"] = county_coordinates_df["Longitude"].str.replace('\U00002013', '-')

# Convert Longitude and Latitude to float so we can display on the map. 
county_coordinates_df["Latitude"] = county_coordinates_df["Latitude"].astype(float)
county_coordinates_df["Longitude"] = county_coordinates_df["Longitude"].astype(float)

# Check the county coordinates data
county_coordinates_df.head()
C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:18: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:19: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
Out[13]:
Sort [1] state FIPS county County Seat(s) [3] Population Land Area Land Area.1 Water Area Water Area.1 Total Area Total Area.1 Latitude Longitude
0 NaN NaN NaN NaN NaN -2010 km² mi² km² mi² km² mi² NaN NaN
1 1.0 AL 1001.0 Autauga Prattville 54,571 1,539.58 594.436 25.776 9.952 1,565.36 604.388 32.536382 -86.644490
2 2.0 AL 1003.0 Baldwin Bay Minette 182,265 4,117.52 1,589.78 1,133.19 437.527 5,250.71 2,027.31 30.659218 -87.746067
3 3.0 AL 1005.0 Barbour Clayton 27,457 2,291.82 884.876 50.865 19.639 2,342.68 904.515 31.870670 -85.405456
4 4.0 AL 1007.0 Bibb Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148
In [14]:
# Merge the Zillow data and county coordinates data.
master_df = pd.merge(zillow_merge_df, county_coordinates_df, on=['county', 'state'])

master_df['date']=pd.to_datetime(master_df['date'])

# Check the master data
master_df
Out[14]:
region_id region_type region county state indicator_id date value Sort [1] FIPS County Seat(s) [3] Population Land Area Land Area.1 Water Area Water Area.1 Total Area Total Area.1 Latitude Longitude
0 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-02-28 139430.0 1922.0 37063.0 Durham 267,587 740.673 285.975 30.798 11.891 771.471 297.866 36.036589 -78.877919
1 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-03-31 139459.0 1922.0 37063.0 Durham 267,587 740.673 285.975 30.798 11.891 771.471 297.866 36.036589 -78.877919
2 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-04-30 139659.0 1922.0 37063.0 Durham 267,587 740.673 285.975 30.798 11.891 771.471 297.866 36.036589 -78.877919
3 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-05-31 139887.0 1922.0 37063.0 Durham 267,587 740.673 285.975 30.798 11.891 771.471 297.866 36.036589 -78.877919
4 999 county Durham County; NC; Durham-Chapel Hill Durham NC ZSFH 1997-06-30 140303.0 1922.0 37063.0 Durham 267,587 740.673 285.975 30.798 11.891 771.471 297.866 36.036589 -78.877919
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
628666 100 county Bibb County; AL; Birmingham-Hoover Bibb AL ZSFH 2022-02-28 161462.0 4.0 1007.0 Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148
628667 100 county Bibb County; AL; Birmingham-Hoover Bibb AL ZSFH 2022-03-31 162369.0 4.0 1007.0 Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148
628668 100 county Bibb County; AL; Birmingham-Hoover Bibb AL ZSFH 2022-04-30 163859.0 4.0 1007.0 Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148
628669 100 county Bibb County; AL; Birmingham-Hoover Bibb AL ZSFH 2022-05-31 164684.0 4.0 1007.0 Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148
628670 100 county Bibb County; AL; Birmingham-Hoover Bibb AL ZSFH 2022-06-30 166720.0 4.0 1007.0 Centreville 22,915 1,612.48 622.582 9.289 3.587 1,621.77 626.169 33.015893 -87.127148

628671 rows × 20 columns

4. Display in a Map¶

In [15]:
# Get mean data by state and county
county_df = master_df.groupby(["state", "county"]).mean()

# Divide price by 1000 so that it looks better on map.
county_df["value"] = county_df["value"] / 1000

# Check data
print(county_df.head())
print(county_df.tail())
region_id value Sort [1] FIPS Latitude Longitude
state county
AL Autauga 1524.0 144.376453 1.0 1001.0 32.536382 -86.644490
Baldwin 1525.0 172.793462 2.0 1003.0 30.659218 -87.746067
Barbour 1531.0 88.204607 3.0 1005.0 31.870670 -85.405456
Bibb 100.0 117.338341 4.0 1007.0 33.015893 -87.127148
Blount 883.0 126.394141 5.0 1009.0 33.977448 -86.567246
region_id value Sort [1] FIPS Latitude Longitude
state county
WY Sweetwater 3036.0 217.047598 3139.0 56037.0 41.660339 -108.875676
Teton 745.0 850.719510 3140.0 56039.0 44.049321 -110.588102
Uinta 760.0 183.118510 3141.0 56041.0 41.284726 -110.558947
Washakie 2593.0 162.015983 3142.0 56043.0 43.878831 -107.669052
Weston 1486.0 169.969867 3143.0 56045.0 43.846213 -104.570020
In [16]:
county_df.hvplot.points(
    'Longitude',
    'Latitude',
    geo=True,
    size='value',
    color='value',
    tiles='OSM',
    height=700,
    width=1200)
Out[16]:

The MAC/D¶

In [17]:
# Creates a DataFrame using only the columns we are interested in

filtered_df = master_df[['date','county','state','value']]

filtered_df['county'] = filtered_df['county'] + ", " + filtered_df['state']
drop_cols = ['state']
filtered_df = filtered_df.drop(columns=drop_cols) 
C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
In [18]:
# Figured out the change in number of counties was messing up the charts

exploratory_df=filtered_df.groupby('date').count()
In [19]:
# Create new DataFrame with summed county markets to represent the entire nation
nationwide_df = filtered_df.groupby(filtered_df['date']).agg({'value':'sum'})

# Must divide 'values' by number of counties that make up said value so data isn't skewed by county number
nationwide_df['avg'] = nationwide_df['value']/exploratory_df['county']
In [50]:
# Define a function for getting a nationwide MACD indicator using pandas_ta
def get_nationwide_macd(fast, slow, signal):
    nationwide_macd_df = nationwide_df.ta.macd(close='avg', fast=fast, slow=slow, signal=signal, append=True)
    # Making DataFrame look nice
    nationwide_macd_df = nationwide_macd_df.rename(columns={f'MACD_{fast}_{slow}_{signal}':'fast_ema',f'MACDh_{fast}_{slow}_{signal}':'signal',f'MACDs_{fast}_{slow}_{signal}':'slow_ema'}).dropna()
    # Divide by 1000 so it looks more like a momentum indicator
    nationwide_macd_df = nationwide_macd_df/1000
    return nationwide_macd_df
In [48]:
# Use newly defined funtion

nationwide_macd_df = get_nationwide_macd(6, 12, 4)
In [49]:
# Graph

nationwide_macd_df.hvplot(title='US Housing Market Momentum', ylabel='Momentum')
Out[49]:
In [58]:
# Show mean housing price in county

filtered_df.hvplot(title='Mean Single Famiy Home Price',groupby='county', x='date', yformatter='%.0f')
Out[58]:
In [62]:
# Define a function for getting a county-specific MACD indicator using pandas_ta
def get_county_macd(fast, slow, signal):
    
    county_macd_df=filtered_df.copy()
    
    county_macd_df.ta.macd(close='value', fast=fast, slow=slow, signal=signal, append=True)
    
    # Making DataFrame look nice
    county_macd_df = county_macd_df.rename(columns={f'MACD_{fast}_{slow}_{signal}':'fast_ema',f'MACDh_{fast}_{slow}_{signal}':'signal',f'MACDs_{fast}_{slow}_{signal}':'slow_ema'}).dropna()
    
    county_macd_df = county_macd_df.drop(columns='value').set_index('date')

    county_macd_df[['fast_ema','signal','slow_ema']] = county_macd_df[['fast_ema','signal','slow_ema']]/1000
    
    return county_macd_df
In [63]:
# Use newly defined function
county_macd_df=get_county_macd(6,12,4)
In [64]:
county_macd_df.hvplot(title='MAC/D by County', groupby='county', x='date', ylabel='Momentum')
Out[64]:
In [ ]: